Initial Data Exploration

Prerequisites

The following external packages are used:

  • plotly: plotting
  • pandas: to build dataframes
  • cufflinks: plotly wrapper for plotting with pandas dataframes
In [1]:
import json
from pathlib import Path

# Standard plotly imports
import plotly.graph_objs as go
# NOTE(review): plotly.plotly was removed in plotly>=4 (moved to the
# chart_studio package) and `py` is unused in this notebook — confirm no
# later cell needs it before dropping the import entirely.
import plotly.plotly as py
from plotly.offline import iplot

# To create dataframes
import pandas as pd

# Cufflinks wrapper on plotly
import cufflinks
cufflinks.go_offline()

# Set the global theme for cufflinks
cufflinks.set_config_file(world_readable=True, theme='solar', offline=True)

# Project root is one level above the notebook's working directory.
# Path.cwd() replaces the original shell-magic `!pwd`, which fails on
# Windows and on kernels without a POSIX shell.
project_dir = Path.cwd().resolve().parents[0]
In [2]:
# Load the preprocessed training set (list of comment records).
# A context manager closes the file deterministically — the original
# `json.load(open(...))` leaked the handle.
data_path = project_dir / 'data' / 'processed'
with open(data_path / 'training_data.json') as fp:
    training_set = json.load(fp)
In [3]:
# Using the sum of the x_counts vector as a feature for visualization
for e in training_set:
    e['x_counts_sum'] = sum(e['x_counts'])
In [4]:
features = [
    'controversiality',
    'children',
    'x_counts_sum',
    'popularity_score',
]

# Observed [min, max] per feature, used as axis ranges for the
# parallel-coordinates plots below.
#
# Fix: the original seeded every range with [0, 0], which clamps the
# minimum at 0 for all-positive features and the maximum at 0 for
# all-negative ones (e.g. popularity_score) — the true extremes were
# never reachable. Seed from the data itself instead; fall back to
# [0, 0] only when the training set is empty.
ranges = {}
for f in features:
    values = [e[f] for e in training_set]
    ranges[f] = [min(values), max(values)] if values else [0, 0]
In [5]:
# Build the frame directly from the list of records.
# Fix: the original round-tripped through json.dumps + pd.read_json,
# which re-serializes data already in memory; passing a raw JSON string
# to read_json is also deprecated in pandas 2.x (expects a path/buffer).
df = pd.DataFrame(training_set)
df.sort_values(by=['popularity_score']).head()
Out[5]:
children controversiality is_root length popularity_score stem text x_counts x_counts_sum
42 0 0 1 8 -6.778971 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... Plumbus. [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0
8052 0 0 0 9 -6.266876 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [removed] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0
764 0 0 0 20 -6.057178 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... OP's mom is a slut. [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ... 2
413 1 0 0 9 -5.787133 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... [deleted] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 1
9299 0 0 1 9 -5.158391 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... [deleted] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 1
In [6]:
# Parallel-coordinates view of the non-text features, with each line
# colored by its popularity score (dark purple = low, pale yellow = high).
# Fix: dropped the dead commented-out x_counts_sum dimension — the same
# dimension appears live in the next cell, so the comment block was clutter.
data = [
    go.Parcoords(
        line = dict(color = df['popularity_score'],
                    colorscale = [[0,'#6A1B9A'],[0.6,'#E85285'],[1,'#FFECB3']]),
        dimensions = list([
            dict(range = [0,1],
                 label = 'is root',
                 values = df['is_root']
                ),
            dict(range = ranges['children'],
                 label = 'children',
                 values = df['children']
                ),
            dict(range = ranges['controversiality'],
                 label = 'controversiality',
                 values = df['controversiality']
                ),
            dict(range = ranges['popularity_score'],
                 label = 'popularity',
                 values = df['popularity_score']
                ),
        ])
    )
]

layout = go.Layout(title='Training data without text features')
training_no_text = go.Figure(data=data, layout=layout)

iplot(training_no_text)

Observations:

  1. Non-controversial comments are the most popular
  2. Overall, comments with more children are more popular
In [7]:
# Same parallel-coordinates view as above, extended with the
# top-160-word-count sum as an additional dimension.
# Each axis is described once as (label, range, column) and the
# dimension dicts are generated from that table.
dimension_spec = [
    ('is root',                   [0, 1],                      'is_root'),
    ('children',                  ranges['children'],          'children'),
    ('controversiality',          ranges['controversiality'],  'controversiality'),
    ('top 160 words sum (basic)', ranges['x_counts_sum'],      'x_counts_sum'),
    ('popularity',                ranges['popularity_score'],  'popularity_score'),
]

data = [
    go.Parcoords(
        line=dict(
            color=df['popularity_score'],
            colorscale=[[0, '#6A1B9A'], [0.6, '#E85285'], [1, '#FFECB3']],
        ),
        dimensions=[
            dict(range=axis_range, label=axis_label, values=df[column])
            for axis_label, axis_range, column in dimension_spec
        ],
    )
]

layout = go.Layout(title='Training data with top 160 words')

training_top_160_basic = go.Figure(data=data, layout=layout)

iplot(training_top_160_basic)

Observations:
High top-160-word-sum doesn't imply high popularity, which motivates more advanced text features.

In [8]:
# Shifting popularity scores to non negative values
df2 = df.copy()
df2['popularity_score'] += abs(ranges['popularity_score'][0])

df3 = df2[['popularity_score', 'x_counts_sum']].set_index('x_counts_sum')
df3.iplot(
    kind='bar',
    xTitle='x counts sum',
    yTitle='popularity',
    title='Popularity in terms of x counts sum')

Observations:
Similar observation as above for high top-160-word-count sum. There's a sweet spot between 0 and ~150 that hosts the most popular comments.